This file contains an example of tuning an XGBoost model with BayesSearchCV.
# Imports and training-data loading for the XGBoost / BayesSearchCV example.
import pickle
import time

import helpsk as hlp
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# Render plotly figures inline in the notebook.
pio.renderers.default = 'notebook'

# Load the pre-split training data pickled by an earlier step.
with open('../X_train.pkl', 'rb') as pickle_file:
    X_train = pickle.load(pickle_file)
with open('../y_train.pkl', 'rb') as pickle_file:
    y_train = pickle.load(pickle_file)
# Summary statistics for the numeric columns of X_train (nulls, zeros, moments, quantiles).
hlp.pandas.numeric_summary(X_train)
| # of Non-Nulls | # of Nulls | % Nulls | # of Zeros | % Zeros | Mean | St Dev. | Coef of Var | Skewness | Kurtosis | Min | 10% | 25% | 50% | 75% | 90% | Max | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| duration | 760 | 40 | 5.0% | 0 | 0.0% | 21.0 | 11.7 | 0.6 | 1.0 | 0.6 | 4.0 | 9.0 | 12.0 | 18.0 | 24.0 | 36.0 | 60.0 |
| credit_amount | 800 | 0 | 0.0% | 38 | 5.0% | 3,203.9 | 2,932.3 | 0.9 | 1.9 | 3.9 | 0.0 | 753.9 | 1,300.8 | 2,236.5 | 3,951.5 | 7,394.6 | 18,424.0 |
| installment_commitment | 800 | 0 | 0.0% | 0 | 0.0% | 3.0 | 1.1 | 0.4 | -0.5 | -1.2 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| residence_since | 800 | 0 | 0.0% | 0 | 0.0% | 2.9 | 1.1 | 0.4 | -0.3 | -1.4 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| age | 800 | 0 | 0.0% | 0 | 0.0% | 35.6 | 11.4 | 0.3 | 1.0 | 0.7 | 19.0 | 23.0 | 27.0 | 33.0 | 42.0 | 52.0 | 75.0 |
| existing_credits | 800 | 0 | 0.0% | 0 | 0.0% | 1.4 | 0.6 | 0.4 | 1.3 | 1.6 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 | 4.0 |
| num_dependents | 800 | 0 | 0.0% | 0 | 0.0% | 1.1 | 0.3 | 0.3 | 2.0 | 2.1 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 |
# Summary of the non-numeric (categorical) columns: nulls, most frequent value, cardinality.
hlp.pandas.non_numeric_summary(X_train)
| # of Non-Nulls | # of Nulls | % Nulls | Most Freq. Value | # of Unique | % Unique | |
|---|---|---|---|---|---|---|
| checking_status | 763 | 37 | 4.6% | no checking | 4 | 0.5% |
| credit_history | 800 | 0 | 0.0% | existing paid | 5 | 0.6% |
| purpose | 800 | 0 | 0.0% | radio/tv | 10 | 1.2% |
| savings_status | 800 | 0 | 0.0% | <100 | 5 | 0.6% |
| employment | 800 | 0 | 0.0% | 1<=X<4 | 5 | 0.6% |
| personal_status | 800 | 0 | 0.0% | male single | 4 | 0.5% |
| other_parties | 800 | 0 | 0.0% | none | 3 | 0.4% |
| property_magnitude | 800 | 0 | 0.0% | car | 4 | 0.5% |
| other_payment_plans | 800 | 0 | 0.0% | none | 3 | 0.4% |
| housing | 800 | 0 | 0.0% | own | 3 | 0.4% |
| job | 800 | 0 | 0.0% | skilled | 4 | 0.5% |
| own_telephone | 800 | 0 | 0.0% | none | 2 | 0.2% |
| foreign_worker | 800 | 0 | 0.0% | yes | 2 | 0.2% |
# Peek at the first ten target labels (binary 0/1).
y_train[0:10]
array([1, 1, 0, 1, 0, 1, 0, 1, 1, 0])
# Count of each class label in the training target.
np.unique(y_train, return_counts=True)
(array([0, 1]), array([559, 241]))
# Class balance: proportion of each label (0/1) in y_train.
# Compute unique values/counts once rather than calling np.unique twice.
_, label_counts = np.unique(y_train, return_counts=True)
label_counts / np.sum(label_counts)
array([0.69875, 0.30125])
# Pre-defined XGBoost pipeline + Bayesian search space from helpsk.
search_space = hlp.sklearn_search.XGBoostBayesianSearchSpace(random_state=42)
# pip install scikit-optimize
from skopt import BayesSearchCV
from sklearn.model_selection import RepeatedKFold
# Bayesian hyper-parameter search over the XGBoost pipeline / search space
# defined above; 5-fold cross-validation repeated twice, scored by ROC AUC.
bayes_search = BayesSearchCV(
    estimator=search_space.pipeline(data=X_train),
    search_spaces=search_space.search_spaces(),
    cv=RepeatedKFold(n_splits=5, n_repeats=2, random_state=42),
    scoring='roc_auc',
    n_jobs=-1,  # use all available cores
    verbose=1,
    random_state=42,
)
# Run the full Bayesian search and report how long it took.
# time.perf_counter() is the recommended monotonic clock for measuring
# elapsed time (time.time() can jump if the system clock is adjusted).
start_time = time.perf_counter()
bayes_search.fit(X_train, y_train)
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time to run BayesSearchCV: {elapsed_time:.3f} seconds; {elapsed_time / 60:.1f} minutes")
Elapsed time to run BayesSearchCV: 151.414 seconds; 2.5 minutes
# Best mean cross-validated ROC AUC found by the search.
print(bayes_search.best_score_)
0.7660446081377409
# Hyper-parameter combination that produced the best score.
print(bayes_search.best_params_)
OrderedDict([('model', XGBClassifier(base_score=None, booster=None, colsample_bylevel=0.5,
colsample_bynode=None, colsample_bytree=0.5,
enable_categorical=False, eval_metric='logloss', gamma=None,
gpu_id=None, importance_type=None, interaction_constraints=None,
learning_rate=0.01, max_delta_step=None, max_depth=10,
min_child_weight=1, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, reg_alpha=1.0, reg_lambda=4.0,
scale_pos_weight=None, subsample=0.5, tree_method=None,
use_label_encoder=False, validate_parameters=None,
verbosity=None)), ('model__colsample_bylevel', 0.5), ('model__colsample_bytree', 0.5), ('model__learning_rate', 0.01), ('model__max_depth', 10), ('model__min_child_weight', 1), ('model__n_estimators', 100), ('model__reg_alpha', 1.0), ('model__reg_lambda', 4.0), ('model__subsample', 0.5), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer(strategy='most_frequent')), ('prep__numeric__scaler__transformer', None)])
# Wrap the fitted searcher in an MLExperimentResults object, then
# round-trip it through YAML to persist (and verify) the results.
results = hlp.sklearn_eval.MLExperimentResults.from_sklearn_search_cv(
    searcher=bayes_search,
    higher_score_is_better=True,
    parameter_name_mappings=search_space.param_name_mappings(),
)
results.to_yaml_file(yaml_file_name='Run 1 - XGBoost - BayesSearchCV.yaml')
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(yaml_file_name='Run 1 - XGBoost - BayesSearchCV.yaml')
# Best score as loaded back from the YAML file (should match bayes_search.best_score_).
results.best_score
0.7660446081377409
# Best parameters, with names mapped via param_name_mappings.
results.best_params
{'model': 'XGBClassifier()',
'max_depth': 10,
'learning_rate': 0.01,
'n_estimators': 100,
'min_child_weight': 1,
'subsample': 0.5,
'colsample_bytree': 0.5,
'colsample_bylevel': 0.5,
'reg_alpha': 1.0,
'reg_lambda': 4.0,
'imputer': "SimpleImputer(strategy='most_frequent')",
'scaler': 'None',
'encoder': 'CustomOrdinalEncoder()'}
# Trial-by-trial results, sorted by score, with 95% confidence intervals.
results.to_formatted_dataframe(num_rows=100)
/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/scipy/stats/_distn_infrastructure.py:2128: RuntimeWarning: invalid value encountered in multiply /Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/scipy/stats/_distn_infrastructure.py:2129: RuntimeWarning: invalid value encountered in multiply
| roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | max_depth | learning_rate | n_estimators | min_child_weight | subsample | colsample_bytree | colsample_bylevel | reg_alpha | reg_lambda | imputer | encoder |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.766 | 0.741 | 0.791 | 10.000 | 0.010 | 100.000 | 1.000 | 0.500 | 0.500 | 0.500 | 1.000 | 4.000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.764 | 0.743 | 0.785 | 1.000 | 0.010 | 2,000.000 | 3.000 | 0.500 | 1.000 | 1.000 | 0.000 | 1.974 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.763 | 0.739 | 0.787 | 10.000 | 0.010 | 100.000 | 1.000 | 0.500 | 0.500 | 0.500 | 1.000 | 4.000 | SimpleImputer() | OneHotEncoder() |
| 0.763 | 0.738 | 0.787 | 5.000 | 0.010 | 538.000 | 2.000 | 0.702 | 0.973 | 0.500 | 1.000 | 1.000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.762 | 0.740 | 0.785 | 1.000 | 0.010 | 1,275.000 | 1.000 | 0.500 | 1.000 | 1.000 | 0.003 | 1.000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.762 | 0.740 | 0.784 | 1.000 | 0.010 | 2,000.000 | 2.000 | 0.500 | 0.955 | 0.500 | 1.000 | 4.000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.761 | 0.735 | 0.787 | 10.000 | 0.010 | 100.000 | 2.000 | 0.500 | 0.500 | 0.500 | 1.000 | 4.000 | SimpleImputer() | CustomOrdinalEncoder() |
| 0.761 | 0.735 | 0.787 | 10.000 | 0.010 | 266.000 | 2.000 | 0.556 | 0.795 | 0.810 | 1.000 | 1.000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.761 | 0.738 | 0.783 | 2.000 | 0.010 | 1,281.000 | 2.000 | 0.599 | 1.000 | 0.500 | 0.000 | 1.085 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.761 | 0.740 | 0.781 | 3.000 | 0.010 | 1,569.000 | 2.000 | 0.500 | 0.500 | 0.500 | 0.000 | 1.000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.759 | 0.739 | 0.779 | 2.000 | 0.010 | 1,720.000 | 2.000 | 0.755 | 1.000 | 1.000 | 1.000 | 1.998 | SimpleImputer(strategy='median') | OneHotEncoder() |
| 0.759 | 0.734 | 0.784 | 10.000 | 0.010 | 100.000 | 3.000 | 0.500 | 0.500 | 0.500 | 1.000 | 4.000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.759 | 0.734 | 0.784 | 10.000 | 0.010 | 100.000 | 3.000 | 1.000 | 0.558 | 0.500 | 1.000 | 1.000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.757 | 0.735 | 0.779 | 1.000 | 0.010 | 1,930.000 | 2.000 | 0.830 | 1.000 | 1.000 | 1.000 | 1.000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.756 | 0.731 | 0.782 | 2.000 | 0.010 | 467.000 | 3.000 | 0.631 | 0.775 | 0.521 | 1.000 | 4.000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.756 | 0.736 | 0.775 | 1.000 | 0.010 | 2,000.000 | 5.000 | 0.500 | 1.000 | 1.000 | 0.000 | 1.661 | SimpleImputer() | CustomOrdinalEncoder() |
| 0.754 | 0.727 | 0.781 | 2.000 | 0.041 | 100.000 | 2.000 | 0.588 | 0.936 | 0.850 | 0.003 | 2.280 | SimpleImputer(strategy='median') | CustomOrdinalEncoder() |
| 0.754 | 0.733 | 0.774 | 10.000 | 0.010 | 1,324.000 | 2.000 | 0.500 | 1.000 | 1.000 | 1.000 | 4.000 | SimpleImputer(strategy='most_frequent') | OneHotEncoder() |
| 0.753 | 0.730 | 0.775 | 2.000 | 0.010 | 1,972.000 | 1.000 | 1.000 | 0.911 | 0.972 | 1.000 | 3.323 | SimpleImputer(strategy='median') | OneHotEncoder() |
| 0.752 | 0.729 | 0.775 | 5.000 | 0.010 | 1,627.000 | 2.000 | 0.845 | 0.873 | 0.500 | 1.000 | 4.000 | SimpleImputer(strategy='median') | CustomOrdinalEncoder() |
| 0.749 | 0.727 | 0.772 | 1.000 | 0.196 | 100.000 | 10.000 | 1.000 | 0.967 | 0.500 | 0.078 | 1.000 | SimpleImputer(strategy='most_frequent') | OneHotEncoder() |
| 0.749 | 0.725 | 0.773 | 2.000 | 0.044 | 395.000 | 6.000 | 0.987 | 0.552 | 0.959 | 0.101 | 2.165 | SimpleImputer(strategy='median') | CustomOrdinalEncoder() |
| 0.748 | 0.726 | 0.770 | 1.000 | 0.044 | 338.000 | 2.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | SimpleImputer(strategy='most_frequent') | OneHotEncoder() |
| 0.747 | 0.728 | 0.767 | 7.000 | 0.010 | 1,105.000 | 11.000 | 1.000 | 0.854 | 0.835 | 0.942 | 1.319 | SimpleImputer(strategy='median') | OneHotEncoder() |
| 0.745 | 0.723 | 0.768 | 4.000 | 0.024 | 694.000 | 3.000 | 0.907 | 0.871 | 0.909 | 0.000 | 3.453 | SimpleImputer() | CustomOrdinalEncoder() |
| 0.744 | 0.723 | 0.766 | 6.000 | 0.019 | 972.000 | 4.000 | 0.779 | 0.582 | 0.970 | 0.014 | 2.970 | SimpleImputer(strategy='median') | CustomOrdinalEncoder() |
| 0.741 | 0.717 | 0.766 | 1.000 | 0.010 | 930.000 | 2.000 | 1.000 | 1.000 | 1.000 | 0.000 | 1.000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.740 | 0.716 | 0.765 | 1.000 | 0.010 | 1,463.000 | 10.000 | 1.000 | 0.851 | 1.000 | 1.000 | 1.000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.735 | 0.716 | 0.755 | 1.000 | 0.191 | 1,800.000 | 12.000 | 1.000 | 0.907 | 0.788 | 0.000 | 1.280 | SimpleImputer() | OneHotEncoder() |
| 0.734 | 0.708 | 0.761 | 1.000 | 0.010 | 100.000 | 1.000 | 0.592 | 0.500 | 1.000 | 0.000 | 1.944 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.733 | 0.712 | 0.754 | 10.000 | 0.033 | 2,000.000 | 1.000 | 0.673 | 1.000 | 0.500 | 0.000 | 1.000 | SimpleImputer(strategy='most_frequent') | OneHotEncoder() |
| 0.733 | 0.709 | 0.757 | 2.000 | 0.057 | 1,305.000 | 17.000 | 0.869 | 0.743 | 0.679 | 0.001 | 3.446 | SimpleImputer() | OneHotEncoder() |
| 0.732 | 0.708 | 0.755 | 10.000 | 0.026 | 100.000 | 50.000 | 1.000 | 0.500 | 0.500 | 1.000 | 1.000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.730 | 0.713 | 0.747 | 1.000 | 0.300 | 2,000.000 | 2.000 | 1.000 | 0.500 | 1.000 | 0.000 | 4.000 | SimpleImputer() | OneHotEncoder() |
| 0.728 | 0.703 | 0.753 | 1.000 | 0.010 | 217.000 | 15.000 | 0.809 | 0.500 | 1.000 | 1.000 | 1.908 | SimpleImputer() | CustomOrdinalEncoder() |
| 0.725 | 0.700 | 0.749 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | OneHotEncoder() |
| 0.724 | 0.703 | 0.746 | 1.000 | 0.010 | 100.000 | 17.000 | 0.984 | 0.803 | 0.500 | 1.000 | 4.000 | SimpleImputer() | OneHotEncoder() |
| 0.721 | 0.697 | 0.744 | 7.000 | 0.254 | 363.000 | 1.000 | 0.977 | 0.652 | 0.942 | 0.003 | 2.414 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.720 | 0.697 | 0.743 | 10.000 | 0.023 | 2,000.000 | 10.000 | 0.846 | 1.000 | 0.788 | 0.000 | 1.000 | SimpleImputer() | OneHotEncoder() |
| 0.718 | 0.699 | 0.737 | 3.000 | 0.153 | 1,536.000 | 1.000 | 0.693 | 0.799 | 0.586 | 0.309 | 3.540 | SimpleImputer(strategy='most_frequent') | OneHotEncoder() |
| 0.718 | 0.696 | 0.740 | 1.000 | 0.010 | 100.000 | 4.000 | 1.000 | 1.000 | 0.500 | 1.000 | 1.000 | SimpleImputer(strategy='most_frequent') | OneHotEncoder() |
| 0.717 | 0.691 | 0.742 | 1.000 | 0.010 | 2,000.000 | 50.000 | 0.918 | 0.500 | 0.645 | 1.000 | 1.000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.715 | 0.687 | 0.742 | 8.000 | 0.114 | 906.000 | 17.000 | 0.830 | 0.763 | 0.719 | 0.040 | 1.631 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.712 | 0.691 | 0.733 | 7.000 | 0.085 | 2,000.000 | 7.000 | 0.674 | 0.979 | 0.500 | 1.000 | 4.000 | SimpleImputer(strategy='median') | OneHotEncoder() |
| 0.710 | 0.692 | 0.729 | 1.000 | 0.300 | 2,000.000 | 1.000 | 0.500 | 1.000 | 1.000 | 0.000 | 4.000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.710 | 0.689 | 0.731 | 2.000 | 0.173 | 1,184.000 | 1.000 | 0.913 | 0.748 | 0.960 | 0.000 | 1.707 | SimpleImputer(strategy='median') | OneHotEncoder() |
| 0.703 | 0.675 | 0.730 | 4.000 | 0.196 | 1,333.000 | 10.000 | 0.817 | 0.680 | 0.888 | 0.005 | 2.766 | SimpleImputer(strategy='median') | CustomOrdinalEncoder() |
| 0.691 | 0.660 | 0.721 | 3.000 | 0.300 | 1,661.000 | 3.000 | 0.500 | 0.724 | 0.638 | 1.000 | 1.000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.681 | 0.651 | 0.710 | 1.000 | 0.051 | 2,000.000 | 50.000 | 1.000 | 0.500 | 1.000 | 0.000 | 4.000 | SimpleImputer() | OneHotEncoder() |
| 0.500 | <NA> | <NA> | 10.000 | 0.042 | 1,069.000 | 50.000 | 0.500 | 1.000 | 1.000 | 0.108 | 1.000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 0.500 | <NA> | <NA> | 2.000 | 0.010 | 100.000 | 50.000 | 0.566 | 0.636 | 0.500 | 0.004 | 1.777 | SimpleImputer(strategy='median') | OneHotEncoder() |
# Visualize search performance and parameter behavior across trials.
results.plot_performance_across_trials().show()
# Exclude the two degenerate trials that scored exactly 0.5.
results.plot_performance_across_trials(query="`roc_auc Mean` > 0.5").show()
results.plot_performance_across_trials(size='learning_rate', color='max_depth').show()
results.plot_performance_across_trials(size='learning_rate', color='encoder').show()
results.plot_parameter_values_across_trials().show()
results.plot_scatter_matrix(height=1000, width=1000 * hlp.plot.GOLDEN_RATIO).show()
results.plot_performance_numeric_params(height=800)
results.plot_parallel_coordinates().show()
results.plot_performance_non_numeric_params()
# Score versus a single hyper-parameter; point size/color encode two others.
results.plot_score_vs_parameter(
    parameter='learning_rate',
    size='colsample_bytree',
    color='encoder'
)
/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/statsmodels/nonparametric/smoothers_lowess.py:227: RuntimeWarning: invalid value encountered in true_divide
# Relationship between two hyper-parameters across trials; size encodes a third.
results.plot_parameter_vs_parameter(
    parameter_x='colsample_bytree',
    parameter_y='learning_rate',
    size='max_depth'
)
# NOTE(review): 'imputer' is a non-numeric parameter — confirm the plotting
# helper supports non-numeric values for `size`.
results.plot_parameter_vs_parameter(
    parameter_x='colsample_bytree',
    parameter_y='learning_rate',
    size='imputer'
)
# Build a dataframe of the mean score plus hyper-parameter values per trial.
# (The original line had stray cell-output text fused onto it, which made it
# invalid Python; the code below is the intended statement.)
score_variable = results.primary_score_name + ' Mean'
score_dataframe = results.to_dataframe()
# Keep only the score column and the hyper-parameter columns.
keep_columns = [score_variable] + results.parameter_names
score_dataframe = score_dataframe.drop(
    columns=[x for x in score_dataframe.columns if x not in keep_columns]
)
score_dataframe.head()
| roc_auc Mean | max_depth | learning_rate | n_estimators | min_child_weight | subsample | colsample_bytree | colsample_bylevel | reg_alpha | reg_lambda | imputer | encoder | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 25 | 0.766045 | 10.0 | 0.01 | 100.0 | 1.0 | 0.500000 | 0.500000 | 0.5 | 1.000000 | 4.000000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 35 | 0.764225 | 1.0 | 0.01 | 2000.0 | 3.0 | 0.500000 | 1.000000 | 1.0 | 0.000100 | 1.973507 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 26 | 0.763043 | 10.0 | 0.01 | 100.0 | 1.0 | 0.500000 | 0.500000 | 0.5 | 1.000000 | 4.000000 | SimpleImputer() | OneHotEncoder() |
| 39 | 0.762612 | 5.0 | 0.01 | 538.0 | 2.0 | 0.702231 | 0.972685 | 0.5 | 1.000000 | 1.000000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 19 | 0.762189 | 1.0 | 0.01 | 1275.0 | 1.0 | 0.500000 | 1.000000 | 1.0 | 0.003357 | 1.000000 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
# Map each column name to a sanitized version usable as a patsy formula term:
# spaces become underscores, and any remaining character that is neither
# alphanumeric nor an underscore is dropped.
cleaned_column_names = {
    column: ''.join(ch for ch in column.replace(' ', '_') if ch == '_' or ch.isalnum())
    for column in score_dataframe.columns.tolist()
}
cleaned_column_names
{'roc_auc Mean': 'roc_auc_Mean',
'max_depth': 'max_depth',
'learning_rate': 'learning_rate',
'n_estimators': 'n_estimators',
'min_child_weight': 'min_child_weight',
'subsample': 'subsample',
'colsample_bytree': 'colsample_bytree',
'colsample_bylevel': 'colsample_bylevel',
'reg_alpha': 'reg_alpha',
'reg_lambda': 'reg_lambda',
'imputer': 'imputer',
'encoder': 'encoder'}
# Apply the sanitized names so every column is a valid patsy formula term.
score_dataframe = score_dataframe.rename(columns=cleaned_column_names)
import statsmodels.formula.api as smf
# Regress the mean CV score on the hyper-parameters to see which ones are
# associated with performance.
y_column = 'roc_auc_Mean'
X_columns = score_dataframe.columns.tolist()
X_columns.remove(y_column)
# Join the predictor names into a patsy-style "a + b + c" string.
X_columns = hlp.string.collapse(X_columns, separate=" + ", surround="")
formula = f"{y_column} ~ {X_columns}"
print(formula)
model = smf.ols(formula=formula, data = score_dataframe)
# NOTE(review): this rebinds `results`, shadowing the MLExperimentResults
# object created earlier — any later cell expecting the experiment results
# would need to re-load them from YAML.
results = model.fit()
print(results.summary())
roc_auc_Mean ~ max_depth + learning_rate + n_estimators + min_child_weight + subsample + colsample_bytree + colsample_bylevel + reg_alpha + reg_lambda + imputer + encoder
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.711
Model: OLS Adj. R-squared: 0.617
Method: Least Squares F-statistic: 7.581
Date: Tue, 15 Feb 2022 Prob (F-statistic): 8.19e-07
Time: 10:07:58 Log-Likelihood: 108.52
No. Observations: 50 AIC: -191.0
Df Residuals: 37 BIC: -166.2
Df Model: 12
Covariance Type: nonrobust
======================================================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------------------------------
Intercept 0.7490 0.044 17.205 0.000 0.661 0.837
imputer[T.SimpleImputer(strategy='median')] -0.0266 0.014 -1.886 0.067 -0.055 0.002
imputer[T.SimpleImputer(strategy='most_frequent')] -0.0052 0.015 -0.356 0.724 -0.035 0.024
encoder[T.OneHotEncoder()] -0.0164 0.012 -1.392 0.172 -0.040 0.007
max_depth -0.0005 0.001 -0.318 0.753 -0.003 0.002
learning_rate -0.1734 0.062 -2.783 0.008 -0.300 -0.047
n_estimators 1.539e-05 7.29e-06 2.112 0.041 6.28e-07 3.02e-05
min_child_weight -0.0029 0.000 -8.171 0.000 -0.004 -0.002
subsample 0.0868 0.028 3.117 0.004 0.030 0.143
colsample_bytree -0.0512 0.030 -1.700 0.098 -0.112 0.010
colsample_bylevel -0.0208 0.025 -0.848 0.402 -0.071 0.029
reg_alpha 0.0119 0.011 1.057 0.297 -0.011 0.035
reg_lambda 0.0016 0.005 0.342 0.734 -0.008 0.011
==============================================================================
Omnibus: 3.177 Durbin-Watson: 1.630
Prob(Omnibus): 0.204 Jarque-Bera (JB): 2.758
Skew: -0.011 Prob(JB): 0.252
Kurtosis: 4.150 Cond. No. 1.79e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.79e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import pandas as pd

# Identify which hyper-parameter columns are numeric vs non-numeric so only
# the numeric ones get standardized (making regression coefficients
# directly comparable). Removed an unused `scaler` variable and a
# commented-out dead line from the original.
numeric_columns = hlp.pandas.get_numeric_columns(score_dataframe)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(score_dataframe)
print(numeric_columns)
print(non_numeric_columns)

numeric_pipeline = Pipeline([
    ('scaling', StandardScaler()),
])
transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, numeric_columns),
    ('non_numeric_pipeline', 'passthrough', non_numeric_columns)
])
# ColumnTransformer returns a plain array (object dtype, since the
# non-numeric columns pass through); rebuild a DataFrame whose column order
# matches the transformer output: numeric first, then non-numeric.
score_dataframe_transformed = transformations_pipeline.fit_transform(score_dataframe)
score_dataframe_transformed = pd.DataFrame(
    score_dataframe_transformed,
    columns=numeric_columns + non_numeric_columns
)
score_dataframe_transformed.head()
['roc_auc_Mean', 'max_depth', 'learning_rate', 'n_estimators', 'min_child_weight', 'subsample', 'colsample_bytree', 'colsample_bylevel', 'reg_alpha', 'reg_lambda'] ['imputer', 'encoder']
| roc_auc_Mean | max_depth | learning_rate | n_estimators | min_child_weight | subsample | colsample_bytree | colsample_bylevel | reg_alpha | reg_lambda | imputer | encoder | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.707014 | 1.626343 | -0.571176 | -1.274616 | -0.57359 | -1.273526 | -1.45366 | -1.184238 | 1.040882 | 1.386193 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 1 | 0.671243 | -0.880008 | -0.571176 | 1.27784 | -0.43403 | -1.273526 | 1.072117 | 1.100289 | -1.008936 | -0.240214 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 2 | 0.647998 | 1.626343 | -0.571176 | -1.274616 | -0.57359 | -1.273526 | -1.45366 | -1.184238 | 1.040882 | 1.386193 | SimpleImputer() | OneHotEncoder() |
| 3 | 0.639529 | 0.233926 | -0.571176 | -0.686207 | -0.50381 | -0.270451 | 0.934133 | -1.184238 | 1.040882 | -1.021523 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
| 4 | 0.631203 | -0.880008 | -0.571176 | 0.303877 | -0.57359 | -1.273526 | 1.072117 | 1.100289 | -1.002259 | -1.021523 | SimpleImputer(strategy='most_frequent') | CustomOrdinalEncoder() |
# The ColumnTransformer output is object-dtype; cast the scaled numeric
# hyper-parameter columns back to float so statsmodels treats them as
# numeric. (Replaces ten identical copy-pasted astype lines with one loop.)
for column in ['roc_auc_Mean', 'max_depth', 'learning_rate', 'n_estimators',
               'min_child_weight', 'subsample', 'colsample_bytree',
               'colsample_bylevel', 'reg_alpha', 'reg_lambda']:
    score_dataframe_transformed[column] = score_dataframe_transformed[column].astype('float')
# Re-fit the same OLS formula on the standardized data so the coefficient
# magnitudes are directly comparable across hyper-parameters (note the
# condition number drops from ~1.79e+04 to ~7.34 in the output below).
print(formula)
model = smf.ols(formula=formula,
                data = score_dataframe_transformed)
results = model.fit()
print(results.summary())
roc_auc_Mean ~ max_depth + learning_rate + n_estimators + min_child_weight + subsample + colsample_bytree + colsample_bylevel + reg_alpha + reg_lambda + imputer + encoder
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.711
Model: OLS Adj. R-squared: 0.617
Method: Least Squares F-statistic: 7.581
Date: Tue, 15 Feb 2022 Prob (F-statistic): 8.19e-07
Time: 10:07:58 Log-Likelihood: -40.413
No. Observations: 50 AIC: 106.8
Df Residuals: 37 BIC: 131.7
Df Model: 12
Covariance Type: nonrobust
======================================================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------------------------------
Intercept 0.2963 0.259 1.146 0.259 -0.228 0.820
imputer[T.SimpleImputer(strategy='median')] -0.5223 0.277 -1.886 0.067 -1.083 0.039
imputer[T.SimpleImputer(strategy='most_frequent')] -0.1015 0.285 -0.356 0.724 -0.679 0.476
encoder[T.OneHotEncoder()] -0.3220 0.231 -1.392 0.172 -0.791 0.147
max_depth -0.0320 0.101 -0.318 0.753 -0.236 0.172
learning_rate -0.2920 0.105 -2.783 0.008 -0.505 -0.079
n_estimators 0.2253 0.107 2.112 0.041 0.009 0.441
min_child_weight -0.8203 0.100 -8.171 0.000 -1.024 -0.617
subsample 0.3441 0.110 3.117 0.004 0.120 0.568
colsample_bytree -0.1994 0.117 -1.700 0.098 -0.437 0.038
colsample_bylevel -0.0896 0.106 -0.848 0.402 -0.304 0.125
reg_alpha 0.1140 0.108 1.057 0.297 -0.104 0.332
reg_lambda 0.0386 0.113 0.342 0.734 -0.190 0.267
==============================================================================
Omnibus: 3.177 Durbin-Watson: 1.630
Prob(Omnibus): 0.204 Jarque-Bera (JB): 2.758
Skew: -0.011 Prob(JB): 0.252
Kurtosis: 4.150 Cond. No. 7.34
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Collect each term's coefficient and p-value from the standardized OLS fit.
coefficients = pd.DataFrame({
    'feature': results.params.index,
    'coefficient': results.params,
    'p_value': results.pvalues,
})
# Drop the intercept; .copy() avoids a pandas SettingWithCopyWarning when
# the 'Stat Sig' column is added to the filtered frame below.
coefficients = coefficients.query("feature != 'Intercept'").copy()
coefficients['Stat Sig'] = coefficients['p_value'] <= 0.05
coefficients
| feature | coefficient | p_value | Stat Sig | |
|---|---|---|---|---|
| imputer[T.SimpleImputer(strategy='median')] | imputer[T.SimpleImputer(strategy='median')] | -0.522286 | 6.713280e-02 | False |
| imputer[T.SimpleImputer(strategy='most_frequent')] | imputer[T.SimpleImputer(strategy='most_frequen... | -0.101521 | 7.238572e-01 | False |
| encoder[T.OneHotEncoder()] | encoder[T.OneHotEncoder()] | -0.321980 | 1.721872e-01 | False |
| max_depth | max_depth | -0.031958 | 7.526281e-01 | False |
| learning_rate | learning_rate | -0.291992 | 8.430626e-03 | True |
| n_estimators | n_estimators | 0.225282 | 4.146531e-02 | True |
| min_child_weight | min_child_weight | -0.820321 | 8.298450e-10 | True |
| subsample | subsample | 0.344126 | 3.521606e-03 | True |
| colsample_bytree | colsample_bytree | -0.199354 | 9.760000e-02 | False |
| colsample_bylevel | colsample_bylevel | -0.089645 | 4.019555e-01 | False |
| reg_alpha | reg_alpha | 0.114008 | 2.971395e-01 | False |
| reg_lambda | reg_lambda | 0.038627 | 7.342668e-01 | False |
# Echo the score column name used in the chart title below.
score_variable
'roc_auc Mean'
# Horizontal bar chart of coefficients, ordered by absolute magnitude,
# colored by whether the term was statistically significant (p <= 0.05).
px.bar(
    data_frame=coefficients.reindex(coefficients['coefficient'].abs().sort_values(ascending=True).index),
    y='feature',
    x='coefficient',
    color='Stat Sig',
    title=f"Regression Coefficients of Hyper-parameters against '{score_variable}'",
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)
from sklearn.inspection import permutation_importance
# Permutation importance of each feature under the best estimator found
# by the search: shuffle one column at a time and measure the score drop.
estimator = bayes_search.best_estimator_
start_time = time.time()
result = permutation_importance(
    estimator, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2
)
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")
feature_names = X_train.columns.to_list()
# Mean importance per feature, sorted descending for plotting.
forest_importances = pd.Series(result.importances_mean, index=feature_names)
forest_importances = forest_importances.sort_values(ascending=False)
Elapsed time to compute the importances: 4.638 seconds
import matplotlib.pyplot as plt

# Plot permutation importances with their standard deviations as error bars.
fig, ax = plt.subplots()
# BUG FIX: forest_importances was sorted descending above, but
# result.importances_std is still in the original feature order — passing it
# directly pairs each bar with the std of the wrong feature. Reindex the
# stds to match the sorted bar order.
importances_std = pd.Series(result.importances_std, index=feature_names)
forest_importances.plot.bar(yerr=importances_std[forest_importances.index], ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.set_size_inches(9, 6)
fig.tight_layout()
plt.show()
# Compare default rates across foreign_worker groups.
temp = X_train.copy()
temp['default'] = y_train
# Use the string 'mean' — passing np.mean to .agg is deprecated in newer
# pandas and resolves to the same mean aggregation.
temp.groupby('foreign_worker').agg({'default': 'mean'})
| default | |
|---|---|
| foreign_worker | |
| yes | 0.308290 |
| no | 0.107143 |
# Box plot of age by default status: does age separate defaulters from
# non-defaulters? (Removed commented-out leftover arguments copied from an
# earlier scatter-plot cell.)
fig = px.box(
    data_frame=temp,
    y='age',
    x='default',
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)
fig.show()
NOTE: foreign_worker seems like it should be important (the default rate is 30.8% for 'yes' vs 10.7% for 'no') but it is ranked last in permutation feature importance.